The 2022 FIFA World Cup will be hosted by Qatar. We try to predict which country has the highest probability of winning the tournament using supervised learning.

1. Data

The data that we will be using is already cleaned and does not require pre-processing.

The data set provides a complete overview of all international football matches played since the 1990s. In addition, the strength of each team is captured through the FIFA rankings as well as player ratings taken from the EA Sports FIFA video game. It is available on Kaggle.

Variables

  • date : date of the match
  • home_team : name of the home team
  • away_team : name of the away team
  • home_team_continent : continent of the home team
  • away_team_continent : continent of the away team
  • home_team_fifa_rank : FIFA rank of the home team when the match took place
  • away_team_fifa_rank : FIFA rank of the away team when the match took place
  • home_team_total_fifa_points : total number of FIFA points of the home team at the time of the match
  • away_team_total_fifa_points : total number of FIFA points of the away team at the time of the match
  • home_team_score : full-time home score (excluding penalty shootout)
  • away_team_score : full-time away score (excluding penalty shootout)
  • tournament : name of tournament
  • city : name of the city where the match was played
  • country : name of the country where the match was played
  • neutral_location :
    • TRUE : the match was played at a neutral venue
  • shoot_out:
    • TRUE : the match included a penalty shootout
  • home_team_result : result of the home team (including penalty shootout)
  • home_team_goalkeeper_score : FIFA game score of the highest ranked GK of the home team
  • away_team_goalkeeper_score : FIFA game score of the highest ranked GK of the away team
  • home_team_mean_defense_score : Average FIFA game score of the 4 highest ranked defensive players of the home team
  • away_team_mean_defense_score : Average FIFA game score of the 4 highest ranked defensive players of the away team
  • home_team_mean_midfield_score : Average FIFA game score of the 4 highest ranked midfield players of the home team
  • away_team_mean_midfield_score : Average FIFA game score of the 4 highest ranked midfield players of the away team
  • home_team_mean_offense_score : Average FIFA game score of the 3 highest ranked attacking players of the home team, including wing players
  • away_team_mean_offense_score : Average FIFA game score of the 3 highest ranked attacking players of the away team, including wing players

2. EDA

# class of each variable
spec(input_data)
cols(
  date = col_date(format = ""),
  home_team = col_character(),
  away_team = col_character(),
  home_team_continent = col_character(),
  away_team_continent = col_character(),
  home_team_fifa_rank = col_double(),
  away_team_fifa_rank = col_double(),
  home_team_total_fifa_points = col_double(),
  away_team_total_fifa_points = col_double(),
  home_team_score = col_double(),
  away_team_score = col_double(),
  tournament = col_character(),
  city = col_character(),
  country = col_character(),
  neutral_location = col_logical(),
  shoot_out = col_character(),
  home_team_result = col_character(),
  home_team_goalkeeper_score = col_double(),
  away_team_goalkeeper_score = col_double(),
  home_team_mean_defense_score = col_double(),
  home_team_mean_offense_score = col_double(),
  home_team_mean_midfield_score = col_double(),
  away_team_mean_defense_score = col_double(),
  away_team_mean_offense_score = col_double(),
  away_team_mean_midfield_score = col_double()
)
# summary
skim_without_charts(input_data)
── Data Summary ────────────────────────
                           Values    
Name                       input_data
Number of rows             23921     
Number of columns          25        
_______________________              
Column type frequency:               
  character                9         
  Date                     1         
  logical                  1         
  numeric                  14        
________________________             
Group variables            None      

Missing data

input_data %>%
  summarise_all(list(~is.na(.)))%>%
  pivot_longer(everything(),
               names_to = "variables", values_to="missing") %>%
  count(variables, missing) %>%
  ggplot(aes(y=variables,x=n,fill=missing))+
  geom_col()+
  scale_fill_manual(values=c("#A3BE8C","#EBCB8B"))+
  theme(axis.title.y=element_blank())

Top 10 teams in 2022

# Get the ranking of all home teams
home <-
  input_data %>% 
  select(date, home_team, home_team_fifa_rank) %>% 
  rename(team = home_team, ranking = home_team_fifa_rank)

# Get the ranking of all away teams
away <-
  input_data %>% 
  select(date, away_team, away_team_fifa_rank) %>% 
  rename(team = away_team, ranking = away_team_fifa_rank)

# Combine both data frames into one
fifa_ranking <- rbind(home, away)

# Get the latest ranking of each country based on their most recent match
latest_fifa_ranking <-
  fifa_ranking %>% 
  arrange(team, desc(date)) %>% 
  group_by(team) %>% 
  mutate(row_number = row_number(team)) %>% 
  filter(row_number == 1) %>% 
  select(-row_number, -date) %>% 
  arrange(ranking)
  
head(latest_fifa_ranking, 10)

FIFA rankings over time

top5_list <- head(latest_fifa_ranking, 5)$team

top5_ranking <-
  fifa_ranking  %>% 
  filter(team %in% top5_list)

p <-
  ggplot(data = top5_ranking,
         mapping = aes(
           x = date,
           y = ranking,
           group = team,
           color = team
         )) +
  geom_line() +
  scale_y_reverse() +
  labs(
    x = "Date",
    y = "FIFA Ranking",
    color = "Team",
    title = "FIFA Rankings of the 2022 Top 5 teams"
  )

ggplotly(p)
NA

Teams with strongest GK

# Gather goalkeeper data from matches
gk_home <-
  input_data %>% 
  select(date, home_team, home_team_goalkeeper_score) %>% 
  rename(team = home_team, goalkeeper_rating = home_team_goalkeeper_score)

gk_away <-
  input_data %>% 
  select(date, away_team, away_team_goalkeeper_score) %>% 
  rename(team = away_team, goalkeeper_rating = away_team_goalkeeper_score)

gk_rating <- drop_na(rbind(gk_home, gk_away))

# Get latest rating of each team's goalkeeper and show top 10
latest_gk_rating <-
  gk_rating %>% 
  arrange(team, desc(date)) %>% 
  group_by(team) %>% 
  mutate(row_number = row_number(team)) %>% 
  filter(row_number == 1) %>% 
  select(-row_number, -date) %>% 
  arrange(-goalkeeper_rating)

ggplot(data = head(latest_gk_rating, 10), mapping = aes(x=goalkeeper_rating, y=reorder(team, goalkeeper_rating), label=goalkeeper_rating)) +
  geom_col(fill="#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(title = "Top 10 teams with the strongest goalkeeper",
       subtitle = "Based on the highest rated goalkeeper of each team",
       x="Goalkeeper Rating",
       y="Country")

Teams with strongest defense

# Gather goalkeeper and defense data from matches
def_home <-
  input_data %>% 
  select(date, home_team, home_team_goalkeeper_score, home_team_mean_defense_score) %>% 
  rename(team = home_team, goalkeeper_rating = home_team_goalkeeper_score, mean_defense_rating = home_team_mean_defense_score)

def_away <-
  input_data %>% 
  select(date, away_team, away_team_goalkeeper_score, away_team_mean_defense_score) %>% 
  rename(team = away_team, goalkeeper_rating = away_team_goalkeeper_score, mean_defense_rating = away_team_mean_defense_score)

def_rating <- drop_na(rbind(def_home, def_away))

# Get latest combined rating of each team and show top 10
latest_def_rating <-
  def_rating %>% 
  arrange(team, desc(date)) %>% 
  mutate(total_def = goalkeeper_rating + mean_defense_rating) %>% 
  group_by(team) %>% 
  mutate(row_number = row_number(team)) %>% 
  filter(row_number==1) %>% 
  arrange(-total_def) %>% 
  select(-row_number, -date)

ggplot(data = head(latest_def_rating, 10), mapping=aes(x=total_def, y=reorder(team, total_def), label=total_def)) + 
  geom_col(fill="#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(title = "Top 10 teams with the strongest defense",
       subtitle = "Based on goalkeeper and mean defense ratings",
       x = "Total Defense Rating",
       y = "Teams") 

Teams with strongest midfield

mid_home <-
  input_data %>% 
  select(date, home_team, home_team_mean_midfield_score) %>% 
  rename(team = home_team, midfield_rating = home_team_mean_midfield_score)

mid_away <-
  input_data %>% 
  select(date, away_team, away_team_mean_midfield_score) %>% 
  rename(team = away_team, midfield_rating = away_team_mean_midfield_score)

mid_rating <- drop_na(rbind(mid_home, mid_away))

# Get latest midfield rating of each team and show top 10
latest_mid_rating <-
  mid_rating %>% 
  arrange(team, desc(date)) %>% 
  group_by(team) %>% 
  mutate(row_number = row_number(team)) %>% 
  filter(row_number == 1) %>% 
  arrange(-midfield_rating) %>% 
  select(-date, -row_number)

ggplot(data = head(latest_mid_rating, 10), mapping=aes(x=midfield_rating, y=reorder(team, midfield_rating), label=midfield_rating)) + 
  geom_col(fill= "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(title = "Top 10 teams with the strongest midfield",
       subtitle = "Based on the average rating of the 4 highest rated midfield players of each team",
       x = "Midfield Rating",
       y = "Teams")

Teams with strongest offense

off_home <-
  input_data %>% 
  select(date, home_team, home_team_mean_offense_score) %>% 
  rename(team = home_team, offense_rating = home_team_mean_offense_score)

off_away <-
  input_data %>% 
  select(date, away_team, away_team_mean_offense_score) %>% 
  rename(team = away_team, offense_rating = away_team_mean_offense_score)

off_rating <- drop_na(rbind(off_home, off_away))

# Get latest offense rating of each team and show top 10
latest_off_rating <-
  off_rating %>% 
  arrange(team, desc(date)) %>% 
  group_by(team) %>% 
  mutate(row_number = row_number(team)) %>% 
  filter(row_number == 1) %>% 
  arrange(-offense_rating) %>% 
  select(-date, -row_number)

ggplot(data = head(latest_off_rating, 10), mapping=aes(x=offense_rating, y=reorder(team, offense_rating), label=offense_rating)) +
  geom_col(fill="#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(title="Top 10 teams with the strongest offense",
      subtitle="Based on the average rating of the 3 highest rated offensive players of each team",
      x="Offense Rating",
      y="Teams")

Is it better to play at home ?

home_team_advantage <-
  input_data %>% 
  filter(neutral_location == FALSE) %>% 
  count(home_team_result) %>% 
  mutate(percentage = label_percent()(n/sum(n)))

ggplot(data = home_team_advantage, mapping=aes(x="", y=n, fill=home_team_result)) +
  geom_bar(width = 1, stat = "identity", color="white") +
  coord_polar("y") +
  scale_fill_manual(values = c("#EBCB8B", "#BF616A",
                               "#A3BE8C")) +
  theme_void() +
  labs(title = "Distribution of match results of home teams",
       subtitle = "Excluding matches played at neutral locations",
       fill="Result")

Correlation Matrix

# select numeric columns only
input_numeric_data <- input_data %>%
  select_if(is.numeric) %>%
  drop_na()

# rename variables for easier correlation plot visualization
input_numeric_data <- input_numeric_data %>% rename(
  rank1 = home_team_fifa_rank,
  rank2 = away_team_fifa_rank,
  total_fifa_points1 = home_team_total_fifa_points,
  total_fifa_points2 = away_team_total_fifa_points,
  score1 = home_team_score,
  score2 = away_team_score,
  gk_score1 = home_team_goalkeeper_score,
  gk_score2 = away_team_goalkeeper_score,
  df_score1 = home_team_mean_defense_score,
  df_score2 = away_team_mean_defense_score,
  att_score1 = home_team_mean_offense_score,
  att_score2 = away_team_mean_offense_score,
  mf_score1 = home_team_mean_midfield_score,
  mf_score2 = away_team_mean_midfield_score
)

# create correlation plot
input_numeric_data %>%
  cor() %>%
  corrplot(
    type = "upper",
    diag = FALSE,
    col=colorRampPalette(c("firebrick","lightyellow","green4"))(100),
    method = "shade",
    shade.col = NA,
    tl.col = "black",
    tl.srt = 45
  )

3. Data Processing / Feature Engineering

Create new features

output_data$win <- output_data$score_difference > 0
Warning: Unknown or uninitialised column: `score_difference`.
Error:
! Assigned data `output_data$score_difference > 0` must be compatible with existing data.
✖ Existing data has 23921 rows.
✖ Assigned data has 0 rows.
ℹ Only vectors of size 1 are recycled.
Backtrace:
  1. base::`$<-`(`*tmp*`, win, value = `<lgl>`)
 12. tibble (local) `<fn>`(`<vctrs___>`)

Model

# create training and test set

sample <- sample(c(TRUE, FALSE), nrow(output_data), replace=TRUE, prob=c(0.7,0.3))
train <- output_data[sample, ]
test <- output_data[!sample, ]

# fit logistic regression model
logreg <- glm(win ~ average_rank + rank_diff + point_diff, family = "binomial", data = train)
summary(logreg)

Call:
glm(formula = win ~ average_rank + rank_diff + point_diff, family = "binomial", 
    data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6642  -1.0028  -0.3857   1.0260   2.5995  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.2726775  0.0330011  -8.263  < 2e-16 ***
average_rank  0.0018669  0.0003617   5.161 2.46e-07 ***
rank_diff    -0.0196047  0.0004886 -40.120  < 2e-16 ***
point_diff    0.0003186  0.0001322   2.410    0.016 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 23081  on 16657  degrees of freedom
Residual deviance: 19729  on 16654  degrees of freedom
AIC: 19737

Number of Fisher Scoring iterations: 4
# calc probability of win for each team in test dataset
predicted <- predict(logreg, test, type="response")


# convert wins to 1 and 0
test$win <- ifelse(test$win==TRUE, 1, 0)

# find optimal cutoff probability to use to maximize accuracy
optimal <- optimalCutoff(test$win, predicted)[1]
optimal
[1] 0.5171179
# confusion matrix
confusionMatrix(test$win, predicted)

# calculate miss classification error rate
misClassError(test$win, predicted, threshold = optimal)
[1] 0.3192
# ROC
plotROC(test$win, predicted)

Test

index(wc_rankings_away)
Error in index(wc_rankings_away) : could not find function "index"
    row <- data.frame(matrix(nrow = 0, ncol = length(colnames(test)))) %>%
      colnames(row) <- colnames(test)
Error in data.frame(matrix(nrow = 0, ncol = length(colnames(test)))) %>%  : 
  target of assignment expands to non-language object
---
title: "World Cup 2022"
output:
  html_notebook: default
  pdf_document: default
---

```{r, echo=FALSE}
library(dplyr)
library(skimr)
library(plotly)
library(scales)
library(corrplot)
library(tidyverse)
library(gganimate)
library(InformationValue)
```

The 2022 FIFA World Cup will be hosted by Qatar. We try to predict
which country has the highest probability of winning the tournament using supervised learning.

# 1. Data

The data that we will be using is already cleaned and does not require pre-processing.

The data set provides a complete overview of all international football matches played since the 1990s. In addition, the strength of each team is captured through the FIFA rankings as well as player ratings taken from the EA Sports FIFA video game. It is available on [Kaggle](https://www.kaggle.com/datasets/brenda89/fifa-world-cup-2022).

## Variables

- `date` : date of the match
- `home_team` : name of the home team
- `away_team` : name of the away team
- `home_team_continent` : continent of the home team
- `away_team_continent` : continent of the away team
- `home_team_fifa_rank` : FIFA rank of the home team when the match took place
- `away_team_fifa_rank` : FIFA rank of the away team when the match took place
- `home_team_total_fifa_points` : total number of FIFA points of the home team at the time of the match
- `away_team_total_fifa_points` : total number of FIFA points of the away team at the time of the match
- `home_team_score` : full-time home score (excluding penalty shootout)
- `away_team_score` : full-time away score (excluding penalty shootout)
- `tournament` : name of tournament
- `city` : name of the city where the match was played
- `country` : name of the country where the match was played
- `neutral_location` :
  - `TRUE` : the match was played at a neutral venue
- `shoot_out`:
  - `TRUE` : the match included a penalty shootout
- `home_team_result` : result of the home team (including penalty shootout)
- `home_team_goalkeeper_score` : FIFA game score of the highest ranked GK of the home team
- `away_team_goalkeeper_score` : FIFA game score of the highest ranked GK of the away team
- `home_team_mean_defense_score` : Average FIFA game score of the 4 highest ranked defensive players of the home team
- `away_team_mean_defense_score` : Average FIFA game score of the 4 highest ranked defensive players of the away team
- `home_team_mean_midfield_score` : Average FIFA game score of the 4 highest ranked midfield players of the home team
- `away_team_mean_midfield_score` : Average FIFA game score of the 4 highest ranked midfield players of the away team
- `home_team_mean_offense_score` : Average FIFA game score of the 3 highest ranked attacking players of the home team, including wing players
- `away_team_mean_offense_score` : Average FIFA game score of the 3 highest ranked attacking players of the away team, including wing players

# 2. EDA

```{r}
# Load the match data when it is not already in the session, so that the
# notebook also knits from a clean environment.
# NOTE(review): the file name is an assumption (the Kaggle download is expected
# to sit next to this notebook) -- TODO confirm the path.
if (!exists("input_data")) {
  input_data <- read_csv("international_matches.csv")
}

# Column specification (class of each variable) of the imported data
spec(input_data)
```
```{r}
# Compact per-column summary of the data set (skimr, chart column omitted)
input_data %>%
  skim_without_charts()
```

## Missing data

```{r}
# Count missing vs. present values per column and show them as stacked bars.
# `mutate(across())` replaces the superseded `summarise_all()`, which relied on
# summarise() returning one row per input row (deprecated since dplyr 1.1).
input_data %>%
  mutate(across(everything(), is.na)) %>%
  pivot_longer(
    everything(),
    names_to = "variables",
    values_to = "missing"
  ) %>%
  count(variables, missing) %>%
  ggplot(aes(y = variables, x = n, fill = missing)) +
  geom_col() +
  scale_fill_manual(values = c("#A3BE8C", "#EBCB8B")) +
  theme(axis.title.y = element_blank())
```


## Top 10 teams in 2022

```{r}
# Ranking of the home side of every match
home <-
  input_data %>%
  select(date, team = home_team, ranking = home_team_fifa_rank)

# Ranking of the away side of every match
away <-
  input_data %>%
  select(date, team = away_team, ranking = away_team_fifa_rank)

# Stack both so that each match contributes one row per participating team
fifa_ranking <- bind_rows(home, away)

# Latest ranking of each country, taken from its most recent match.
# After sorting newest-first within each team, distinct() keeps the first
# (most recent) row per team. Unlike the previous group_by()/row_number()
# approach, the result is a plain, ungrouped tibble.
latest_fifa_ranking <-
  fifa_ranking %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(ranking)

head(latest_fifa_ranking, 10)
```

## FIFA rankings over time

```{r}
# Names of the five best-ranked teams according to their latest ranking
top5_list <-
  latest_fifa_ranking %>%
  head(5) %>%
  pull(team)

# Complete ranking history, restricted to those five teams
top5_ranking <- filter(fifa_ranking, team %in% top5_list)

# One line per team; the y axis is reversed so that rank 1 ends up at the top
p <-
  ggplot(
    top5_ranking,
    aes(x = date, y = ranking, group = team, color = team)
  ) +
  geom_line() +
  scale_y_reverse() +
  labs(
    title = "FIFA Rankings of the 2022 Top 5 teams",
    x = "Date",
    y = "FIFA Ranking",
    color = "Team"
  )

# Render the ggplot object as an interactive plotly widget
ggplotly(p)

```

## Teams with strongest GK

```{r}
# Goalkeeper rating per match, for the home and the away side
gk_home <-
  input_data %>%
  select(date, team = home_team, goalkeeper_rating = home_team_goalkeeper_score)

gk_away <-
  input_data %>%
  select(date, team = away_team, goalkeeper_rating = away_team_goalkeeper_score)

# Stack both sides and discard matches without a goalkeeper rating
gk_rating <-
  bind_rows(gk_home, gk_away) %>%
  drop_na()

# Latest goalkeeper rating of each team: sort newest-first within each team and
# keep the first row per team. distinct() returns an ungrouped tibble, so no
# grouping leaks into later steps.
latest_gk_rating <-
  gk_rating %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(desc(goalkeeper_rating))

# Horizontal bar chart of the ten highest ratings, value printed inside each bar
ggplot(
  data = head(latest_gk_rating, 10),
  mapping = aes(
    x = goalkeeper_rating,
    y = reorder(team, goalkeeper_rating),
    label = goalkeeper_rating
  )
) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest goalkeeper",
    subtitle = "Based on the highest rated goalkeeper of each team",
    x = "Goalkeeper Rating",
    y = "Country"
  )
```
### Teams with strongest defense

```{r}
# Goalkeeper and mean defense rating per match, for the home and the away side
def_home <-
  input_data %>%
  select(
    date,
    team = home_team,
    goalkeeper_rating = home_team_goalkeeper_score,
    mean_defense_rating = home_team_mean_defense_score
  )

def_away <-
  input_data %>%
  select(
    date,
    team = away_team,
    goalkeeper_rating = away_team_goalkeeper_score,
    mean_defense_rating = away_team_mean_defense_score
  )

# Stack both sides and discard matches where either rating is missing
def_rating <-
  bind_rows(def_home, def_away) %>%
  drop_na()

# Latest combined rating of each team: sort newest-first within each team and
# keep the first row per team (distinct() returns an ungrouped tibble), then
# add goalkeeper and defense ratings into a single score.
latest_def_rating <-
  def_rating %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  mutate(total_def = goalkeeper_rating + mean_defense_rating) %>%
  select(-date) %>%
  arrange(desc(total_def))

# Horizontal bar chart of the ten highest scores, value printed inside each bar
ggplot(
  data = head(latest_def_rating, 10),
  mapping = aes(
    x = total_def,
    y = reorder(team, total_def),
    label = total_def
  )
) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest defense",
    subtitle = "Based on goalkeeper and mean defense ratings",
    x = "Total Defense Rating",
    y = "Teams"
  )
```

### Teams with strongest midfield

```{r}
# Mean midfield rating per match, for the home and the away side
mid_home <-
  input_data %>%
  select(date, team = home_team, midfield_rating = home_team_mean_midfield_score)

mid_away <-
  input_data %>%
  select(date, team = away_team, midfield_rating = away_team_mean_midfield_score)

# Stack both sides and discard matches without a midfield rating
mid_rating <-
  bind_rows(mid_home, mid_away) %>%
  drop_na()

# Latest midfield rating of each team: sort newest-first within each team and
# keep the first row per team. distinct() returns an ungrouped tibble, so no
# grouping leaks into later steps.
latest_mid_rating <-
  mid_rating %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(desc(midfield_rating))

# Horizontal bar chart of the ten highest ratings, value printed inside each bar
ggplot(
  data = head(latest_mid_rating, 10),
  mapping = aes(
    x = midfield_rating,
    y = reorder(team, midfield_rating),
    label = midfield_rating
  )
) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest midfield",
    subtitle = "Based on the average rating of the 4 highest rated midfield players of each team",
    x = "Midfield Rating",
    y = "Teams"
  )
```

### Teams with strongest offense

```{r}
# Mean offense rating per match, for the home and the away side
off_home <-
  input_data %>%
  select(date, team = home_team, offense_rating = home_team_mean_offense_score)

off_away <-
  input_data %>%
  select(date, team = away_team, offense_rating = away_team_mean_offense_score)

# Stack both sides and discard matches without an offense rating
off_rating <-
  bind_rows(off_home, off_away) %>%
  drop_na()

# Latest offense rating of each team: sort newest-first within each team and
# keep the first row per team. distinct() returns an ungrouped tibble, so no
# grouping leaks into later steps.
latest_off_rating <-
  off_rating %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(desc(offense_rating))

# Horizontal bar chart of the ten highest ratings, value printed inside each bar
ggplot(
  data = head(latest_off_rating, 10),
  mapping = aes(
    x = offense_rating,
    y = reorder(team, offense_rating),
    label = offense_rating
  )
) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest offense",
    subtitle = "Based on the average rating of the 3 highest rated offensive players of each team",
    x = "Offense Rating",
    y = "Teams"
  )
```

## Is it better to play at home ?

```{r}
# Tally each home_team_result value over the matches that were not played at a
# neutral venue, and add the share of every outcome as a formatted percentage
home_team_advantage <-
  input_data %>%
  filter(!neutral_location) %>%
  count(home_team_result) %>%
  mutate(percentage = label_percent()(n / sum(n)))

# Pie chart: a single stacked bar wrapped around polar coordinates
ggplot(home_team_advantage, aes(x = "", y = n, fill = home_team_result)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("#EBCB8B", "#BF616A", "#A3BE8C")) +
  theme_void() +
  labs(
    title = "Distribution of match results of home teams",
    subtitle = "Excluding matches played at neutral locations",
    fill = "Result"
  )
```

## Correlation Matrix

```{r}
# Keep the numeric columns only and drop every row with a missing value, since
# cor() would otherwise return NA for the affected pairs.
# `select(where())` replaces the superseded `select_if()`.
input_numeric_data <-
  input_data %>%
  select(where(is.numeric)) %>%
  drop_na()

# Shorter names keep the axis labels of the correlation plot readable
# (suffix 1 = home team, suffix 2 = away team)
input_numeric_data <-
  input_numeric_data %>%
  rename(
    rank1 = home_team_fifa_rank,
    rank2 = away_team_fifa_rank,
    total_fifa_points1 = home_team_total_fifa_points,
    total_fifa_points2 = away_team_total_fifa_points,
    score1 = home_team_score,
    score2 = away_team_score,
    gk_score1 = home_team_goalkeeper_score,
    gk_score2 = away_team_goalkeeper_score,
    df_score1 = home_team_mean_defense_score,
    df_score2 = away_team_mean_defense_score,
    att_score1 = home_team_mean_offense_score,
    att_score2 = away_team_mean_offense_score,
    mf_score1 = home_team_mean_midfield_score,
    mf_score2 = away_team_mean_midfield_score
  )

# Upper-triangle correlation plot, shaded from red through yellow to green
input_numeric_data %>%
  cor() %>%
  corrplot(
    type = "upper",
    diag = FALSE,
    col = colorRampPalette(c("firebrick", "lightyellow", "green4"))(100),
    method = "shade",
    shade.col = NA,
    tl.col = "black",
    tl.srt = 45
  )
```

# 3. Data Processing / Feature Engineering

## Create new features

```{r}
# Derive the modelling features from the raw match data in one step.
#   rank_diff    : home rank minus away rank
#   average_rank : mean of both ranks
#   point_diff   : home FIFA points minus away FIFA points
#   score_diff   : home goals minus away goals
#   win          : TRUE only when the home team scored more goals
#                  (a draw counts as FALSE)
#   stake        : TRUE for every match that is not a friendly
output_data <-
  input_data %>%
  mutate(
    rank_diff = home_team_fifa_rank - away_team_fifa_rank,
    average_rank = (home_team_fifa_rank + away_team_fifa_rank) / 2,
    point_diff = home_team_total_fifa_points - away_team_total_fifa_points,
    score_diff = home_team_score - away_team_score,
    win = score_diff > 0,
    stake = tournament != "Friendly"
  )
```

## Model

```{r}
# Fix the RNG seed so that the random train/test split, and every number
# reported below, is reproducible between runs
set.seed(2022)

# Randomly assign roughly 70 % of the matches to training and 30 % to testing.
# The mask is not called `sample`, to avoid shadowing base::sample().
in_train <-
  sample(c(TRUE, FALSE),
         nrow(output_data),
         replace = TRUE,
         prob = c(0.7, 0.3))
train <- output_data[in_train, ]
test <- output_data[!in_train, ]

# Logistic regression of a home win on the ranking-based features
logreg <-
  glm(win ~ average_rank + rank_diff + point_diff,
      family = "binomial",
      data = train)
summary(logreg)

# Predicted probability of a home win for every match in the test set
predicted <- predict(logreg, newdata = test, type = "response")

# Convert the logical outcome to 1 / 0 for the InformationValue helpers
test$win <- as.numeric(test$win)

# Cutoff probability that minimises the misclassification error.
# NOTE(review): the cutoff is tuned on the test set itself, so the error
# reported below is optimistic -- consider tuning it on a validation split.
optimal <- optimalCutoff(test$win, predicted)[1]
optimal

# Confusion matrix at the tuned cutoff (previously the default 0.5 was used,
# which was inconsistent with the error rate computed below)
confusionMatrix(test$win, predicted, threshold = optimal)

# Misclassification error rate at the tuned cutoff
misClassError(test$win, predicted, threshold = optimal)

# ROC curve
plotROC(test$win, predicted)

```

## Test

```{r}
# Teams taking part in the 2022 World Cup, stored as a plain character vector
# (the idiomatic right-hand side for %in%) instead of a list.
# NOTE(review): the spellings must match the values of home_team / away_team
# exactly (e.g. "USA", "South Korea", "Iran") -- verify against the data.
wc_teams <- c(
  "Qatar", "Ecuador", "Senegal", "Netherlands", "England", "Iran", "USA",
  "Wales", "Argentina", "Saudi Arabia", "Mexico", "Poland", "France",
  "Australia", "Denmark", "Tunisia", "Spain", "Costa Rica", "Germany",
  "Japan", "Belgium", "Canada", "Morocco", "Croatia", "Brazil", "Serbia",
  "Switzerland", "Cameroon", "Portugal", "Ghana", "Uruguay", "South Korea"
)

# Only matches played after this date are considered. An explicit Date avoids
# relying on implicit character-to-Date coercion in the comparison.
ranking_cutoff <- as.Date("2013-01-01")

# Rank and FIFA points of the World Cup teams when playing as the home side
wc_rankings_home <-
  output_data %>%
  filter(date > ranking_cutoff, home_team %in% wc_teams) %>%
  select(home_team, home_team_fifa_rank, home_team_total_fifa_points)

# Rank and FIFA points of the World Cup teams when playing as the away side
wc_rankings_away <-
  output_data %>%
  filter(date > ranking_cutoff, away_team %in% wc_teams) %>%
  select(away_team, away_team_fifa_rank, away_team_total_fifa_points)
```

```{r}
# Containers for the simulation output; nothing fills them yet.
# Fixed truncated names: *_quarterfina -> *_quarterfinal, *_semifina -> *_semifinal.
simulation_winners <- list()
simulation_results_winners <- list()
simulation_results_round16 <- list()
simulation_df_round16 <- list()
simulation_results_quarterfinal <- list()
simulation_df_quarterfinal <- list()
simulation_results_semifinal <- list()
simulation_df_semifinal <- list()

# Presumably the number of simulation runs; not used by the loop below yet
n <- 1000

# Teams assumed to come out of the group stage; consecutive entries are paired
# against each other (1 vs 2, 3 vs 4, ...)
candidates <- c(
  "Senegal",
  "Netherlands",
  "England",
  "USA",
  "Argentina",
  "Poland",
  "France",
  "Denmark",
  "Spain",
  "Germany",
  "Belgium",
  "Croatia",
  "Brazil",
  "Serbia",
  "Portugal",
  "Uruguay"
)
finals <- c("round_of_16", "quarterfinal", "semifinal", "final")

# Simulate the knockout rounds.
# TODO: the loop is unfinished. It does not predict a winner yet and does not
# replace `candidates` with the winners after each round, so every round
# currently iterates over the same 16 teams.
for (f in finals) {
  iterations <- length(candidates) %/% 2
  winners <- list()
  prob <- list()

  # seq_len() instead of range(): in R, range(x) returns c(min, max), i.e.
  # c(8, 8) here, so the original loop visited only i = 8 (twice) and indexed
  # past the end of `candidates`.
  for (i in seq_len(iterations)) {
    # R is 1-indexed: pair i consists of elements 2i - 1 and 2i.
    # `[[` extracts the team name itself instead of a one-element container.
    home <- candidates[[2 * i - 1]]
    away <- candidates[[2 * i]]

    # Empty data frame with the same column names as the test set
    row <- data.frame(matrix(nrow = 0, ncol = ncol(test)))
    colnames(row) <- colnames(test)

    # NOTE(review): the original pipe ended in a bare `filter`, which returned
    # the whole table; restricting it to the home side is the presumed
    # intent -- confirm.
    home_rank <- wc_rankings_home %>% filter(home_team == home)

    # TODO: look up the away side, fill `row` and predict the winner
  }
}
  








```


















